/*
 *  NLPFeaturesList.java
 *
 *  Yaoyong Li 22/03/2007
 *
 *  $Id: NLPFeaturesList.java, v 1.0 2007-03-22 12:58:16 +0000 yaoyong $
 */
package gate.learning;

import gate.util.BomStrippingInputStreamReader;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;

/**
 * NLP feature list. Read it from a file, update it using the new documents, and
 * write back into the file.
 */
public class NLPFeaturesList {

    /**
     * Mapping from NLP feature name to its numeric id (1-based, stored as a
     * String or Long); may be accessed by multiple threads, hence Hashtable.
     */
    public Hashtable featuresList = null;
    /**
     * Document frequency of each term, useful for document or passage
     * classification.
     */
    public Hashtable idfFeatures = null;
    /** Total number of documents used for forming the list. */
    int totalNumDocs;
    /** The unique symbol used for the N-gram feature. */
    public final static String SYMBOLNGARM = "<>";
    /**
     * Matches a feature name carrying a trailing position marker such as
     * "token[-1]". Compiled once instead of on every feature of every
     * instance (Pattern.matches recompiles each call).
     */
    private static final Pattern FEATURE_WITH_POSITION =
            Pattern.compile(".+\\[[-0-9]+\\]$");

    /** Constructor: create the two empty hashtables and reset the doc count. */
    public NLPFeaturesList() {
        featuresList = new Hashtable();
        idfFeatures = new Hashtable();
        totalNumDocs = 0;
    }

    /**
     * Load the list from a file. The first line is the header
     * "totalNumDocs=&lt;n&gt;"; each following line is
     * "&lt;feature&gt; &lt;id&gt; &lt;docFrequency&gt;". A missing file is not an
     * error (first run); an I/O failure is reported to stderr.
     *
     * @param parentDir directory containing the list file
     * @param filename  name of the list file
     * @param tcode     character encoding of the file
     */
    public void loadFromFile(File parentDir, String filename, String tcode) {
        File fileFeaturesList = new File(parentDir, filename);
        if (fileFeaturesList.exists()) {
            BufferedReader in = null;
            try {
                in = new BomStrippingInputStreamReader(new FileInputStream(fileFeaturesList), tcode);
                String line;
                // Header line: total number of documents seen so far.
                if ((line = in.readLine()) != null) {
                    totalNumDocs = Integer.parseInt(line.substring(line.lastIndexOf("=") + 1));
                }
                // Body lines: feature name, id, document frequency.
                while ((line = in.readLine()) != null) {
                    String[] st = line.split(" ");
                    featuresList.put(st[0], st[1]);
                    idfFeatures.put(st[0], st[2]);
                }
            } catch (IOException e) {
                // Previously swallowed silently; at least record the failure.
                System.err.println("Error reading NLP features list from "
                        + fileFeaturesList + ": " + e.getMessage());
                e.printStackTrace();
            } finally {
                // Close in finally so a mid-read failure cannot leak the stream.
                if (in != null) {
                    try {
                        in.close();
                    } catch (IOException ignored) {
                        // Nothing more useful to do on a failed close.
                    }
                }
            }
        } else {
            if (LogService.maxVerbosityLevel > 0) {
                System.out.println("No feature list file in initialisation phase.");
            }
        }
    }

    /**
     * Write back the list into the file, with updated information, in the
     * same format read by {@link #loadFromFile}. Features are written in
     * sorted key order so the output is deterministic.
     *
     * @param parentDir directory to write the list file into
     * @param filename  name of the list file
     * @param tcode     character encoding for the output
     */
    public void writeListIntoFile(File parentDir, String filename, String tcode) {
        File fileFeaturesList = new File(parentDir, filename);
        if (LogService.maxVerbosityLevel > 1) {
            System.out.println("Length of List = " + featuresList.size());
        }
        PrintWriter out = null;
        try {
            out = new PrintWriter(new OutputStreamWriter(new FileOutputStream(
                    fileFeaturesList), tcode));
            // Header: the total number of docs.
            out.println("totalNumDocs=" + totalNumDocs);
            List keys = new ArrayList(featuresList.keySet());
            Collections.sort(keys);
            // Write the features list into the output file.
            Iterator iterator = keys.iterator();
            while (iterator.hasNext()) {
                Object key = iterator.next();
                out.println(key + " " + featuresList.get(key) + " "
                        + idfFeatures.get(key));
            }
        } catch (IOException e) {
            // Previously swallowed silently; at least record the failure.
            System.err.println("Error writing NLP features list to "
                    + fileFeaturesList + ": " + e.getMessage());
            e.printStackTrace();
        } finally {
            // PrintWriter.close never throws; guard only against the
            // constructor having failed before out was assigned.
            if (out != null) {
                out.close();
            }
        }
    }

    /**
     * Update the NLP features from new documents: assign an id to each
     * previously-unseen feature and increment the frequency count of known
     * ones, up to ConstantParameters.MAXIMUMFEATURES features in total.
     *
     * @param fd the per-document NLP features to merge into this list
     */
    public void addFeaturesFromDoc(NLPFeaturesOfDoc fd) {
        long size = featuresList.size();
        for (int i = 0; i < fd.numInstances; ++i) {
            String[] features = fd.featuresInLine[i].toString().trim().split(
                    ConstantParameters.ITEMSEPARATOR);
            for (int j = 0; j < features.length; ++j) {
                // Strip a trailing position marker such as "[-1]".
                if (features[j] != null && FEATURE_WITH_POSITION.matcher(features[j]).matches()) {
                    int ind = features[j].lastIndexOf('[');
                    features[j] = features[j].substring(0, ind);
                }
                // If the feature is empty, don't count it as a feature at all.
                if (features[j].equals("")) {
                    continue;
                }
                String feat = features[j];
                // For N-gram features, keep only the text before the marker.
                if (feat.contains(SYMBOLNGARM)) {
                    feat = feat.substring(0, feat.lastIndexOf(SYMBOLNGARM));
                }
                if (!feat.equals(ConstantParameters.NAMENONFEATURE)) {
                    if (size < ConstantParameters.MAXIMUMFEATURES) {
                        if (!featuresList.containsKey(feat)) {
                            // Feature ids start from 1 (not zero), per the
                            // SVM-light format.
                            ++size;
                            featuresList.put(feat, Long.valueOf(size));
                            idfFeatures.put(feat, Long.valueOf(1));
                        } else {
                            idfFeatures.put(feat, Long.valueOf(
                                    Long.parseLong(idfFeatures.get(feat).toString()) + 1));
                        }
                    } else {
                        System.out.println("There are more NLP features from the training documents");
                        System.out.println(" than the pre-defined maximal number"
                                + ConstantParameters.MAXIMUMFEATURES);
                        return;
                    }
                }
            }
        } // end of the loop on the instances
        // Update the total number of docs.
        totalNumDocs += fd.numInstances;
    }

    /** Clear the label list object for another run in evaluation. */
    public void clearAllData() {
        featuresList.clear();
        idfFeatures.clear();
    }

    /**
     * Convert the NLP list into an N-gram language model and write it into a
     * file: only N-gram features (those containing {@link #SYMBOLNGARM}) are
     * emitted, ordered by the sort produced by
     * LightWeightLearningApi.sortFloatAscIndex over their frequencies.
     *
     * @param parentDir directory to write the model file into
     * @param filename  name of the model file
     * @param nGram     N of the N-grams, used only in the file's header line
     */
    public void writeToLM(File parentDir, String filename, int nGram) {
        File ngramList = new File(parentDir, filename);
        if (LogService.maxVerbosityLevel > 1) {
            System.out.println("Length of List = " + featuresList.size());
        }
        PrintWriter out = null;
        try {
            out = new PrintWriter(new OutputStreamWriter(new FileOutputStream(
                    ngramList), "UTF-8"));
            out.println("## The following " + nGram + "-gram were obtained from " + totalNumDocs + " documents or examples");
            List keys = new ArrayList(featuresList.keySet());
            int numT = keys.size();
            // Collect each feature's document frequency for sorting.
            float[] freqs = new float[numT];
            for (int i = 0; i < numT; ++i) {
                freqs[i] = Float.parseFloat(idfFeatures.get(keys.get(i)).toString());
            }
            int[] indexSort = new int[numT];
            LightWeightLearningApi.sortFloatAscIndex(freqs, indexSort, numT, numT);
            // Write the N-gram features in sorted order.
            for (int i = 0; i < numT; ++i) {
                Object key = keys.get(indexSort[i]);
                String str = key.toString();
                if (str.contains(SYMBOLNGARM)) { // if it is an N-gram feature
                    // Keep the text between the first "_" and the N-gram marker.
                    str = str.substring(str.indexOf("_", 1) + 1, str.lastIndexOf(SYMBOLNGARM));
                    out.println(str + " " + (int) freqs[indexSort[i]]);
                }
            }
        } catch (IOException e) {
            // Previously swallowed silently; at least record the failure.
            System.err.println("Error writing N-gram language model to "
                    + ngramList + ": " + e.getMessage());
            e.printStackTrace();
        } finally {
            // Guard against the constructor having failed before out was assigned.
            if (out != null) {
                out.close();
            }
        }
    }
}
